
### -------------------------------------------------- 
### ---- General Set Up ---- 
### -------------------------------------------------- 

### Clear global environment
rm(list=ls()) 


### Libraries:
library(pacman)
p_load(tidyverse, foreign, readstata13, readr, magrittr, here)


### File Location (R project only)
here::i_am("Survey and App Match/106_subset_and merge_state_vars.R")
file_path <- here()
file_path_data_matched <- here("Survey Only Match/Temp_Data/")

### -------------------------------------------------- 
### ---- Download TFA Data ----
### -------------------------------------------------- 

### Download Data
tfa_dat <- read.dta13(paste0(file_path,"/Data/tfa_data.dta")) #changed in survey_only folder


### -------------------------------------------------- 
### ---- Download Data Matched to State ----
### -------------------------------------------------- 

load(paste0(file_path_data_matched,"/tfa_to_state_address.RData"))
load(paste0(file_path_data_matched,"/tfa_to_state_uniname.RData"))
load(paste0(file_path_data_matched,"/tfa_to_state_matreg.RData"))
load(paste0(file_path_data_matched,"/tfa_to_state_survey.RData")) 
load(paste0(file_path_data_matched,"/tfa_to_state_phone.RData"))


### -------------------------------------------------- 
### ---- Combine Data Matched to State ----
### -------------------------------------------------- 

### -------------------------------------------------- 
### Source 1: Current & Permanant Addresses
### -------------------------------------------------- 
tfa_dat_address_NewVars <- tfa_dat_address_FINAL %>%
  select(-c(firstname:PhysicalZipCode)) %>%
  mutate(state_pa = as.character(state_pa_clean),
         state_ca = as.character(state_ca))


### -------------------------------------------------- 
### Source 2: University Names
### -------------------------------------------------- 
tfa_dat_unimatch_NewVars <- tfa_dat_unimatch_FINAL %>%
  select(personid, state_uni, inst_multi, campus_multi, 
         no_campuses, uni_state_source) 


### -------------------------------------------------- 
### Source 3: Matriculation Region
### -------------------------------------------------- 
tfa_dat_matregmatch_NewVars <- tfa_dat_matregmatch_FINAL %>%
  select(personid, state_mr)

### -------------------------------------------------- 
### Source 4: Zip Codes
### -------------------------------------------------- 
tfa_dat_zipcodematch_NewVars <- tfa_dat_zipcodematch_FINAL %>%
  select(personid, state_st, state_survey) #removed city 


### -------------------------------------------------- 
### Source 5: Phone Area Codes
### -------------------------------------------------- 
tfa_dat_acmatch_NewVars <- tfa_dat_acmatch_FINAL %>%
  select(personid, FINAL_state_phone, FINAL_state_phone_alt, state_ac_cell, state_ac_home) %>%
  rename(state_ac1 = FINAL_state_phone) %>%
  rename(state_ac2 = FINAL_state_phone_alt) %>%
  rename(state_cell = state_ac_cell, #added 8/18 
         state_home = state_ac_home)


### -------------------------------------------------- 
### Join New Variables from Each Source
### -------------------------------------------------- 

### Create matched dataset
tfa_dat_match <- tfa_dat

### Join TFA and source 1 (Address)
tfa_dat_match <- left_join(x = tfa_dat_match,
                           y = tfa_dat_address_NewVars,
                           by = "personid") 

### Join TFA and source 2 (University Names)
tfa_dat_match <- left_join(x = tfa_dat_match,
                           y = tfa_dat_unimatch_NewVars,
                           by = "personid") 

### Join TFA and source 3 (Zip Codes)
tfa_dat_match <- left_join(x = tfa_dat_match,
                           y = tfa_dat_zipcodematch_NewVars,
                           by = "personid") 

### Join TFA and source 4 (Matriculation Region)
tfa_dat_match <- left_join(x = tfa_dat_match,
                           y = tfa_dat_matregmatch_NewVars,
                           by = "personid") 

### Join TFA and source 5 (Phone Area Codes)
tfa_dat_match <- left_join(x = tfa_dat_match,
                           y = tfa_dat_acmatch_NewVars,
                           by = "personid") 

### Look at Combined Data
tfa_dat_match %>% select(state_ca, state_pa, 
                         state_uni, 
                         state_st, 
                         state_mr, 
                         state_ac1, state_ac2,
                         state_cell, state_home) %>%
  head(15) #view subset of vars


### -------------------------------------------------- 
### ---- Check Unmatched ---- 
### -------------------------------------------------- 

# Few applicants not linked to a state
tfa_dat_match %>% 
  filter(is.na(state_ca) & 
           is.na(state_pa) & 
           is.na(state_uni) & 
           is.na(state_st) & 
           is.na(state_mr) & 
           is.na(state_ac1) & 
           is.na(state_ac2))


### -------------------------------------------------- 
### ---- Merge w. Citizenship Data ----
### -------------------------------------------------- 

### Load citizenship variable
tfa_citizenship <- read_csv(paste0(file_path, "/Data/2007.2015 Apps Submitted Citizen Status.csv")) %>%
  mutate(personid = as.character(personid)) #convert type

### Merge with main dataset
tfa_dat_match %<>% left_join(x=., y=tfa_citizenship, by=c("personid"))

### -------------------------------------------------- 
### ---- Subset ----
### -------------------------------------------------- 

sum_na <- function(x) sum(is.na(x))
tfa_dat_match %>%
  mutate(uscitizen = ifelse(uscitizen==1, uscitizen, NA)) %>%
  summarise_at(vars(state_st, pre3_0, uscitizen), sum_na)

tfa_dat_match %>%
  mutate_at(vars(state_st, pre3_0, uscitizen), is.na) %>%
  mutate(degreedate = ifelse(degreedate=="", 1, 0)) %>%
  group_by(state_st, pre3_0, uscitizen, degreedate) %>% summarise(n=n())

### Check row count
tfa_dat_match %>% nrow() #obs: 120,417 

### Remove anyone who isn't a citizen
tfa_dat_match %<>% filter(uscitizen==1)
# Check row count
tfa_dat_match %>% nrow() #obs: 117,456

### Remove anyone without survey response for STATE
tfa_dat_match %<>% filter(!is.na(state_st))


### -------------------------------------------------- 
### ---- Saved Matched Data ---- 
### -------------------------------------------------- 

save(tfa_dat_match, 
     file=paste0(file_path_data_matched,"/tfa_to_state_ALL_v2.RData")) #changed name 


